data.orig <- read_xls(path='default of credit card clients.xls',
                      sheet='Data',  skip=1)
data <- data.frame(data.orig)
colnames(data) <- c('id', 'credit_limit', 'gender', 'education', 'marital', 'age', 'rs_sep', 'rs_aug', 'rs_july', 'rs_june', 'rs_may', 'rs_apr', 'bs_sep', 'bs_aug', 'bs_july', 'bs_june', 'bs_may', 'bs_apr', 'ap_sep', 'ap_aug', 'ap_july', 'ap_june', 'ap_may', 'ap_apr', 'default_r')
data <- data[data$marital!=0,]
# Marital status (1 = married; 2 = single; 3 = others)
data[data$marital == 2, 'marital'] <- 'Single'
data[data$marital == 1, 'marital'] <- 'Married'
data[data$marital == 3, 'marital'] <- 'Others'
marital.levels <- c("Single", "Married", "Others")
data$marital <- factor(data$marital, levels = marital.levels)

data[data$education >= 4 | data$education ==0, 'education'] <- 4
# Education (1 = graduate school; 2 = university; 3 = high school; 4 = others)
data[data$education==1, 'education'] <- 'Graduate School'
data[data$education==2, 'education'] <- 'University'
data[data$education==3, 'education'] <- 'High School'
data[data$education==4, 'education'] <- 'Others'
education.levels <- c("Others", "High School", "Graduate School", "University")
data$education <- factor(data$education, levels=education.levels)

# Gender (1 = male; 2 = female)
data[data$gender==1, 'gender'] <- 'Male'
data[data$gender==2, 'gender'] <- 'Female'
gender_levels <- c("Male", "Female")
data$gender <- factor(data$gender, levels=gender_levels)

data$rs_sep <- factor(data$rs_sep)
data$rs_aug <- factor(data$rs_aug)
data$rs_july <- factor(data$rs_july)
data$rs_june <- factor(data$rs_june)
data$rs_may <- factor(data$rs_may)
data$rs_apr <- factor(data$rs_apr)
data[data$default_r==1, 'default_r'] <- 'Yes'
data[data$default_r==0, 'default_r'] <- 'No'
default_levels <- c("No", "Yes")
data$default_r <- factor(data$default_r, levels=default_levels)

1 Introduction

In recent years, the credit card issuers in Taiwan faced the cash and credit card debt crisis and the delinquency is expected to peak in the third quarter of 2006 (Chou, 2006). In order to increase market share, card-issuing banks in Taiwan over-issued cash and credit cards to unqualified applicants. At the same time, most cardholders, irrespective of their repayment ability, overused credit card for consumption and accumulated heavy credit and cash– card debts. The crisis caused the blow to consumer finance confidence and it is a big challenge for both banks and cardholders.

2 Data

2.1 Description

str(data)
## 'data.frame':    29946 obs. of  25 variables:
##  $ id          : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ credit_limit: num  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ gender      : Factor w/ 2 levels "Male","Female": 2 2 2 2 1 1 1 2 2 1 ...
##  $ education   : Factor w/ 4 levels "Others","High School",..: 4 4 4 4 4 3 3 4 2 2 ...
##  $ marital     : Factor w/ 3 levels "Single","Married",..: 2 1 1 2 2 1 1 1 2 1 ...
##  $ age         : num  24 26 34 37 57 37 29 23 28 35 ...
##  $ rs_sep      : Factor w/ 11 levels "-2","-1","0",..: 5 2 3 3 2 3 3 3 3 1 ...
##  $ rs_aug      : Factor w/ 11 levels "-2","-1","0",..: 5 5 3 3 3 3 3 2 3 1 ...
##  $ rs_july     : Factor w/ 11 levels "-2","-1","0",..: 2 3 3 3 2 3 3 2 5 1 ...
##  $ rs_june     : Factor w/ 11 levels "-2","-1","0",..: 2 3 3 3 3 3 3 3 3 1 ...
##  $ rs_may      : Factor w/ 10 levels "-2","-1","0",..: 1 3 3 3 3 3 3 3 3 2 ...
##  $ rs_apr      : Factor w/ 10 levels "-2","-1","0",..: 1 4 3 3 3 3 3 2 3 2 ...
##  $ bs_sep      : num  3913 2682 29239 46990 8617 ...
##  $ bs_aug      : num  3102 1725 14027 48233 5670 ...
##  $ bs_july     : num  689 2682 13559 49291 35835 ...
##  $ bs_june     : num  0 3272 14331 28314 20940 ...
##  $ bs_may      : num  0 3455 14948 28959 19146 ...
##  $ bs_apr      : num  0 3261 15549 29547 19131 ...
##  $ ap_sep      : num  0 0 1518 2000 2000 ...
##  $ ap_aug      : num  689 1000 1500 2019 36681 ...
##  $ ap_july     : num  0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ ap_june     : num  0 1000 1000 1100 9000 ...
##  $ ap_may      : num  0 0 1000 1069 689 ...
##  $ ap_apr      : num  0 2000 5000 1000 679 ...
##  $ default_r   : Factor w/ 2 levels "No","Yes": 2 2 1 1 1 1 1 1 1 1 ...

2.2 Overview

rmarkdown::paged_table(data)

2.3 Distribution of Imbalanced data

hist.default <- ggplot(data, aes(x=default_r)) + 
                geom_bar() +
                xlab('Credit Card Default') +
                ggtitle('Imbalanced distribution of response variable')  
bxpt.default <- ggplot(data, aes(x=default_r, y=credit_limit)) +
                geom_boxplot(outlier.color = 'red', outlier.shape = 8, fill='gray') +
                stat_boxplot(geom='errorbar', width=0.5) + xlab('Default')

grid.arrange(hist.default, bxpt.default, nrow=1)

data <- SMOTE(default_r~., data=data, perc.over=280)

2.4 Distribution post SMOTE

hist.default <- ggplot(data, aes(x=default_r)) + 
                geom_bar() +
                xlab('Credit Card Default') +
                ggtitle('Imbalanced distribution of response variable')  
bxpt.default <- ggplot(data, aes(x=default_r, y=credit_limit)) +
                geom_boxplot(outlier.color = 'red', outlier.shape = 8, fill='gray') +
                stat_boxplot(geom='errorbar', width=0.5) + xlab('Default')

grid.arrange(hist.default, bxpt.default, nrow=1)

# H0: Credit limit of those who default is less than that of who pay promptly
# H1: Credit limit of those who default is higher than that of who pay promptly
ttest.credit_limit <- t.test(data[data$default_r=="Yes", 'credit_limit'], 
                             mu=mean(data[data$default_r=="No", 'credit_limit']),
                             alternative='greater')
ttest.credit_limit
## 
##  One Sample t-test
## 
## data:  data[data$default_r == "Yes", "credit_limit"]
## t = -70.654, df = 19892, p-value = 1
## alternative hypothesis: true mean is greater than 179123.7
## 95 percent confidence interval:
##  124308.7      Inf
## sample estimates:
## mean of x 
##  125555.8

2.5 Age and Default

hist.age <- ggplot(data, aes(x=age)) +
            geom_histogram(color='white') +
            geom_vline(xintercept=mean(data$age), color='blue', lwd=1.6) +
            geom_vline(xintercept=median(data$age), color='red', lwd=1.6, 
                       linetype='dashed')
            ggtitle('Age distribution')
## $title
## [1] "Age distribution"
## 
## attr(,"class")
## [1] "labels"
bxpt.age <- ggplot(data, aes(x=default_r, y=age)) + 
            geom_boxplot(outlier.color = 'red', outlier.shape = 8, 
                         fill='gray') +
            stat_boxplot(geom='errorbar', width=0.5) + xlab('Default')

grid.arrange(hist.age, bxpt.age, nrow=1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ttest.age_default <- t.test(data[data$default_r=="Yes", 'age'], 
                            data[data$default_r=="No", 'age'])
ttest.age_default
## 
##  Welch Two Sample t-test
## 
## data:  data[data$default_r == "Yes", "age"] and data[data$default_r == "No", "age"]
## t = 3.8077, df = 43272, p-value = 0.0001405
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1565759 0.4887748
## sample estimates:
## mean of x mean of y 
##  35.74497  35.42230

The t.test on age between those defaulted on their credit card bills and those who did not, returned a p.value of 1.404827^{-4}. In simple words, the age does have a connection on those who default.

# Group them so we can have an overall understanding of at which age the default
# happens
data$age_group = 0
data[data$age <= 35, "age_group"] <- "Young"
data[data$age >35 & data$age <= 50, "age_group"] <- "Young Adults"
data[data$age >50 & data$age < 66, "age_group"] <- "Mid-age Adults"
data[data$age >= 66, "age_group"] <- "Seniors"

data$age_group <- factor(data$age_group, levels=c("Young", "Young Adults", 
                                                  "Mid-age Adults", "Seniors"))

age_group_default_cont <- table(data$default_r, data$age_group)
chisq.age_default <- chisq.test(age_group_default_cont)

age_group_default_cont <- data.frame(age_group_default_cont)
colnames(age_group_default_cont) <- c("Default", "Age_Group", "Freq")


bxpt.age_group_default <- ggplot(data, aes(x=age_group, y=age)) +
                          geom_boxplot(outlier.color='red', outlier.shape=8,
                                       fill='gray') +
                          stat_boxplot(geom='errorbar', width=0.5)

bar.age_group_default <- ggplot(age_group_default_cont, aes(x=Age_Group, y=Freq, fill=Default)) +
                         geom_bar(stat='identity', position='identity', alpha=0.55)

grid.arrange(bxpt.age_group_default, bar.age_group_default, nrow=1)

# tmp_data <- data %>% group_by(age_group) %>% summarise(default_count = count(default_r))

chisq.age_default
## 
##  Pearson's Chi-squared test
## 
## data:  age_group_default_cont
## X-squared = 52.064, df = 3, p-value = 2.903e-11

2.6 Education and Default

educ_default_cont <- table(data$default, data$education)
chisq.educ_default <- chisq.test(educ_default_cont)

educ_default_cont <- data.frame(educ_default_cont)
colnames(educ_default_cont) <- c("Default", "Education", "Freq")

hist.educ <- ggplot(educ_default_cont, aes(x=Education, y=Freq, fill=Default)) + 
             geom_bar(stat='identity', position='identity', alpha=0.65)

hist.educ

chisq.educ_default
## 
##  Pearson's Chi-squared test
## 
## data:  educ_default_cont
## X-squared = 557.57, df = 3, p-value < 2.2e-16

2.7 Marital and Default

# Marital status (1 = married; 2 = single; 3 = others)
# Remove rows that have marital 0 since it is not given any meaning

marital_default_cont <- table(data$default_r, data$marital)
chisq.marital_default <- chisq.test(marital_default_cont)

marital_default_cont <- data.frame(marital_default_cont)
colnames(marital_default_cont) <- c("Default", "Marital", "Freq")

ggplot(marital_default_cont, aes(x=Marital, y=Freq, fill=Default)) + 
geom_bar(stat='identity', position='identity', alpha=0.65) + xlab('Marital')

# marital_default_cont

chisq.marital_default
## 
##  Pearson's Chi-squared test
## 
## data:  marital_default_cont
## X-squared = 142.61, df = 2, p-value < 2.2e-16

2.8 Gender and default

gender_default_cont <- table(data$gender, data$default_r)
chisq.gender_default <- chisq.test(gender_default_cont)
gender_default_cont <- data.frame(gender_default_cont)
colnames(gender_default_cont) <- c("Gender", "Default", "Freq")

hist.gender_default <- ggplot(gender_default_cont, aes(x=Gender, y=Freq, fill=Default)) +
                        geom_bar(stat='identity', position='identity', alpha=0.65)
hist.gender_default

chisq.gender_default
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  gender_default_cont
## X-squared = 442.19, df = 1, p-value < 2.2e-16
sep_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_sep, y=)) + 
          geom_bar(aes(y = (..count..)/sum(..count..))) + ylab('Density') +
          ylim(c(0, 0.65)) +
           ggtitle('Sep Default RP')

sep_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_sep)) + ylab('Density') +
           geom_bar(aes(y = (..count..)/sum(..count..))) + 
            ylim(c(0, 0.65)) +
            ggtitle('Sep Non-Default RP')

aug_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_aug)) + ylab('Density') +
           geom_bar(aes(y = (..count..)/sum(..count..))) + 
           ylim(c(0, 0.65)) + ggtitle('Aug Default RP')

aug_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_aug)) + ylab('Density') +
           geom_bar(aes(y = (..count..)/sum(..count..))) +  
           ylim(c(0, 0.65)) + ggtitle('Aug Non-Default RP')

july_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_july)) + ylab('Density') +
           geom_bar(aes(y = (..count..)/sum(..count..))) + 
           ylim(c(0, 0.65)) + ggtitle('July Default RP')

july_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_july)) + ylab('Density') +
            geom_bar(aes(y = (..count..)/sum(..count..))) + 
           ylim(c(0, 0.65)) +  ggtitle('July Non-Default RP')

june_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_june)) + ylab('Density') +
           geom_bar(aes(y = (..count..)/sum(..count..))) +  
           ylim(c(0, 0.65)) + ggtitle('June Default RP')

june_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_june)) + ylab('Density') +
            geom_bar(aes(y = (..count..)/sum(..count..))) +  
            ylim(c(0, 0.65)) + ggtitle('June Non-Default RP')

may_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_may)) + ylab('Density') +
          geom_bar(aes(y = (..count..)/sum(..count..))) +  
          ylim(c(0, 0.65)) + ggtitle('May Default RP')

may_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_may)) + ylab('Density') +
           geom_bar(aes(y = (..count..)/sum(..count..))) +  
           ylim(c(0, 0.65)) + ggtitle('May Non-Default RP')

april_df <- ggplot(data[data$default_r=='Yes',], aes(x=rs_apr)) + ylab('Density') +
            geom_bar(aes(y = (..count..)/sum(..count..))) +  
            ylim(c(0, 0.65)) + ggtitle('April Default RP')

april_ndf <- ggplot(data[data$default_r=='No',], aes(x=rs_apr)) + ylab('Density') +
             geom_bar(aes(y = (..count..)/sum(..count..))) + 
             ylim(c(0, 0.65)) + ggtitle('April Non-Default RP')

grid.arrange(april_df, april_ndf, may_df, may_ndf, june_df, june_ndf, 
             july_df, july_ndf, aug_df, aug_ndf, sep_df, sep_ndf, nrow=6, ncol=2 )

library(dplyr)
ag_due_summary <-  data %>% 
                   dplyr::group_by(age_group) %>% 
                   dplyr::summarize(total_bill = median(bs_sep + bs_aug + bs_july + bs_june + bs_may + bs_apr),
                                    total_paid = median(ap_sep + ap_aug + ap_july + ap_june + ap_may + ap_apr),
                                    credit_limit = median(credit_limit))

ag_due_summary <- data.frame(ag_due_summary)
ag_due_summary$due <- ag_due_summary$total_bill - ag_due_summary$total_paid
ag_due_summary$due_percent <- (ag_due_summary$due/ag_due_summary$credit_limit) * 100

rmarkdown::paged_table(ag_due_summary)
med_due <- round(ag_due_summary$due_percent)
age_labels <- ag_due_summary$age_group
lbls <- paste(age_labels, ' ', med_due, '%')
pie(med_due, labels=lbls, col=rainbow(length(lbls)), main="Median Due by age groups")

getproportion <- function(x)
{
  ul <- unique(x)
  count <- NULL
  for(e in ul) # Hoping it produces in the same order as stored in ul
  {
    count <- c(count, sum(x==e))
  }
  count <- (count/sum(count)) * 100
  return(count)
  # m_c <- max(count, na.rm=T)
  # m_i <- which(count == m_c)
  # return(ul[m_i])
}
education.young <- getproportion(data[data$age_group=='Young', 'education'])
education.young_adults <- getproportion(data[data$age_group=='Young Adults', 'education'])
education.midage_adults <- getproportion(data[data$age_group=='Mid-age Adults', 'education'])
education.seniors <- getproportion(data[data$age_group=='Seniors', 'education'])

young.educ_perc <- round(education.young)
young_adults.educ_perc <- round(education.young_adults)
midage_adults.educ_perc <- round(education.midage_adults)
seniors.educ_perc <- round(education.seniors)

# Young
lbls <- paste(education.levels, ' ', young.educ_perc, '%')
pie(young.educ_perc, labels=lbls, col=rainbow(length(lbls)), main="Education level of Young")

# Young Adults
lbls <- paste(education.levels, ' ', young_adults.educ_perc, '%')
pie(young.educ_perc, labels=lbls, col=rainbow(length(lbls)), main="Education level of Young Adults")

# Mid-age Adults
lbls <- paste(education.levels, ' ', midage_adults.educ_perc, '%')
pie(midage_adults.educ_perc, labels=lbls, col=rainbow(length(lbls)), main="Education level of Mid-age adults")

# Seniors
lbls <- paste(education.levels, ' ', seniors.educ_perc, '%')
pie(seniors.educ_perc, labels=lbls, col=rainbow(length(lbls)), main="Education level of Seniors")

3 Modeling

full_model <- glm('default_r~.-id-age_group', data=data, family='binomial')
# full_model
summary(full_model)
## 
## Call:
## glm(formula = "default_r~.-id-age_group", family = "binomial", 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5309  -0.7849  -0.5185   0.8487   4.0266  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -1.201e+00  1.429e-01  -8.404  < 2e-16 ***
## credit_limit             -2.078e-06  1.220e-07 -17.030  < 2e-16 ***
## genderFemale             -3.632e-01  2.280e-02 -15.929  < 2e-16 ***
## educationHigh School      1.449e+00  1.305e-01  11.102  < 2e-16 ***
## educationGraduate School  1.281e+00  1.294e-01   9.907  < 2e-16 ***
## educationUniversity       1.229e+00  1.289e-01   9.537  < 2e-16 ***
## maritalMarried            1.727e-01  2.427e-02   7.113 1.14e-12 ***
## maritalOthers             5.250e-01  9.649e-02   5.441 5.29e-08 ***
## age                       2.136e-03  1.368e-03   1.562 0.118290    
## rs_sep-1                  3.904e-01  5.863e-02   6.658 2.77e-11 ***
## rs_sep0                  -8.737e-02  6.014e-02  -1.453 0.146314    
## rs_sep1                   5.235e-01  5.202e-02  10.063  < 2e-16 ***
## rs_sep2                   1.791e+00  6.537e-02  27.392  < 2e-16 ***
## rs_sep3                   1.709e+00  1.226e-01  13.943  < 2e-16 ***
## rs_sep4                   1.819e+00  2.358e-01   7.712 1.24e-14 ***
## rs_sep5                   1.062e+00  3.729e-01   2.848 0.004406 ** 
## rs_sep6                  -1.627e-01  5.308e-01  -0.307 0.759218    
## rs_sep7                   6.153e-01  5.825e-01   1.056 0.290836    
## rs_sep8                   1.699e+00  4.518e-01   3.759 0.000170 ***
## rs_aug-1                 -1.033e-01  5.546e-02  -1.862 0.062539 .  
## rs_aug0                  -3.091e-01  5.969e-02  -5.178 2.24e-07 ***
## rs_aug1                  -1.023e+00  3.900e-01  -2.624 0.008699 ** 
## rs_aug2                   4.131e-01  5.807e-02   7.114 1.13e-12 ***
## rs_aug3                   3.850e-01  1.142e-01   3.371 0.000748 ***
## rs_aug4                   3.126e-02  1.989e-01   0.157 0.875102    
## rs_aug5                  -3.951e-02  4.013e-01  -0.098 0.921573    
## rs_aug6                   8.819e-01  5.056e-01   1.744 0.081104 .  
## rs_aug7                   6.131e-01  4.685e-01   1.309 0.190613    
## rs_aug8                  -1.496e+01  3.247e+02  -0.046 0.963255    
## rs_july-1                -1.039e-01  5.392e-02  -1.928 0.053911 .  
## rs_july0                 -1.212e-01  5.566e-02  -2.178 0.029388 *  
## rs_july1                 -1.252e+01  1.700e+02  -0.074 0.941302    
## rs_july2                  4.004e-01  5.834e-02   6.863 6.73e-12 ***
## rs_july3                  4.978e-01  1.342e-01   3.708 0.000209 ***
## rs_july4                  3.820e-01  2.382e-01   1.604 0.108785    
## rs_july5                 -4.150e-02  4.385e-01  -0.095 0.924597    
## rs_july6                  9.590e-01  4.565e-01   2.101 0.035643 *  
## rs_july7                  1.140e+00  5.117e-01   2.229 0.025840 *  
## rs_july8                 -4.793e+00  1.423e+00  -3.368 0.000757 ***
## rs_june-1                -1.865e-01  5.293e-02  -3.523 0.000427 ***
## rs_june0                 -2.035e-01  5.272e-02  -3.859 0.000114 ***
## rs_june1                  1.392e+01  1.700e+02   0.082 0.934750    
## rs_june2                  4.349e-01  5.909e-02   7.361 1.82e-13 ***
## rs_june3                  5.669e-01  1.525e-01   3.718 0.000200 ***
## rs_june4                  7.078e-01  2.399e-01   2.950 0.003176 ** 
## rs_june5                  1.026e-01  3.324e-01   0.309 0.757669    
## rs_june6                 -5.542e-01  8.787e-01  -0.631 0.528254    
## rs_june7                  1.186e+00  3.623e-01   3.274 0.001062 ** 
## rs_june8                  1.159e+01  1.099e+02   0.105 0.916025    
## rs_may-1                 -1.599e-01  5.103e-02  -3.134 0.001726 ** 
## rs_may0                  -1.940e-01  5.015e-02  -3.869 0.000109 ***
## rs_may2                   4.508e-01  6.033e-02   7.473 7.82e-14 ***
## rs_may3                   6.831e-01  1.607e-01   4.252 2.12e-05 ***
## rs_may4                   6.373e-01  2.398e-01   2.657 0.007882 ** 
## rs_may5                   9.030e-01  4.049e-01   2.230 0.025726 *  
## rs_may6                  -6.403e-01  1.715e+00  -0.373 0.708807    
## rs_may7                   1.054e+00  3.624e-01   2.907 0.003649 ** 
## rs_may8                   4.982e+00  1.471e+02   0.034 0.972982    
## rs_apr-1                 -4.350e-02  4.571e-02  -0.952 0.341298    
## rs_apr0                  -2.829e-01  4.538e-02  -6.234 4.53e-10 ***
## rs_apr2                   3.814e-01  5.526e-02   6.901 5.17e-12 ***
## rs_apr3                   1.213e+00  1.622e-01   7.481 7.36e-14 ***
## rs_apr4                   2.918e-01  2.858e-01   1.021 0.307374    
## rs_apr5                   1.128e+00  6.102e-01   1.849 0.064423 .  
## rs_apr6                   1.187e+00  4.497e-01   2.639 0.008317 ** 
## rs_apr7                   8.686e-01  3.820e-01   2.274 0.022977 *  
## rs_apr8                   1.022e+01  1.130e+02   0.090 0.927931    
## bs_sep                   -3.144e-06  8.363e-07  -3.759 0.000171 ***
## bs_aug                    3.238e-06  1.110e-06   2.916 0.003544 ** 
## bs_july                   1.952e-06  1.041e-06   1.875 0.060778 .  
## bs_june                   3.817e-07  1.083e-06   0.353 0.724399    
## bs_may                   -1.001e-06  1.267e-06  -0.790 0.429612    
## bs_apr                    1.266e-06  1.016e-06   1.246 0.212662    
## ap_sep                   -1.622e-05  1.844e-06  -8.795  < 2e-16 ***
## ap_aug                   -1.160e-05  1.610e-06  -7.204 5.83e-13 ***
## ap_july                  -4.031e-06  1.427e-06  -2.824 0.004743 ** 
## ap_june                  -4.994e-06  1.501e-06  -3.327 0.000879 ***
## ap_may                   -7.142e-06  1.471e-06  -4.855 1.20e-06 ***
## ap_apr                   -3.449e-06  1.095e-06  -3.150 0.001635 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 63397  on 46416  degrees of freedom
## Residual deviance: 47819  on 46338  degrees of freedom
## AIC: 47977
## 
## Number of Fisher Scoring iterations: 11
get_cm <- function(predictions, model, co, g_truth)
{
  # pred.list <- round(predictions) # for now
  pred.list <- predictions
  pred.list[predictions <= co] <- 1
  pred.list[predictions > co] <- 2
  cm <- matrix(0, nrow=2, ncol=2)
  for(i in 1:length(pred.list))
  {
    actual <- g_truth[i]
    pred <- pred.list[i]
    cm[pred, actual] <- cm[pred, actual] + 1
  }
  colnames(cm) <- c('Act.Pos', 'Act.Neg')
return(cm)
}

get_evalmetrics <- function(cm, model, n_preds)
{
  precision <- cm[1, 1]/(cm[1, 1] + cm[1, 2])
  recall <- cm[1, 1]/(cm[1, 1] + cm[2, 1])
  sensitivity <- recall
  specificity <- cm[2, 2]/(cm[2, 2] + cm[1, 2])
  f1 <- 2 * ((precision * recall)/(precision + recall))
  mcfadden <- 1 - model$deviance/model$null.deviance
  accuracy <- (cm[1, 1] + cm[2, 2])/n_preds
  metrics <- c(precision, recall, sensitivity, specificity, f1, mcfadden, accuracy)
  
  return(metrics)
}
fm.y_prob <- predict.glm(full_model, newdata=data, type=c("response"))
tmp_cm <- get_cm(fm.y_prob, full_model, 0.5, data$default_r)
tmp_metrics <- get_evalmetrics(tmp_cm, full_model, length(fm.y_prob))
tmp_cm
##      Act.Pos Act.Neg
## [1,]   23109    7575
## [2,]    3415   12318
tmp_metrics
##   Act.Pos   Act.Pos   Act.Pos   Act.Neg   Act.Pos             Act.Pos 
## 0.7531287 0.8712487 0.8712487 0.6192128 0.8078940 0.2457158 0.7632333
exp_coeff <- exp(full_model$coefficients)
exp_coeff
##              (Intercept)             credit_limit             genderFemale 
##             3.009243e-01             9.999979e-01             6.954767e-01 
##     educationHigh School educationGraduate School      educationUniversity 
##             4.257172e+00             3.601965e+00             3.419274e+00 
##           maritalMarried            maritalOthers                      age 
##             1.188459e+00             1.690477e+00             1.002139e+00 
##                 rs_sep-1                  rs_sep0                  rs_sep1 
##             1.477527e+00             9.163422e-01             1.687897e+00 
##                  rs_sep2                  rs_sep3                  rs_sep4 
##             5.993234e+00             5.522436e+00             6.162932e+00 
##                  rs_sep5                  rs_sep6                  rs_sep7 
##             2.891303e+00             8.498445e-01             1.850197e+00 
##                  rs_sep8                 rs_aug-1                  rs_aug0 
##             5.466350e+00             9.018615e-01             7.341167e-01 
##                  rs_aug1                  rs_aug2                  rs_aug3 
##             3.594014e-01             1.511536e+00             1.469680e+00 
##                  rs_aug4                  rs_aug5                  rs_aug6 
##             1.031756e+00             9.612567e-01             2.415572e+00 
##                  rs_aug7                  rs_aug8                rs_july-1 
##             1.846198e+00             3.181072e-07             9.012790e-01 
##                 rs_july0                 rs_july1                 rs_july2 
##             8.858127e-01             3.652197e-06             1.492451e+00 
##                 rs_july3                 rs_july4                 rs_july5 
##             1.645091e+00             1.465233e+00             9.593516e-01 
##                 rs_july6                 rs_july7                 rs_july8 
##             2.609069e+00             3.127883e+00             8.288328e-03 
##                rs_june-1                 rs_june0                 rs_june1 
##             8.298782e-01             8.158909e-01             1.110380e+06 
##                 rs_june2                 rs_june3                 rs_june4 
##             1.544884e+00             1.762782e+00             2.029451e+00 
##                 rs_june5                 rs_june6                 rs_june7 
##             1.108020e+00             5.745553e-01             3.274081e+00 
##                 rs_june8                 rs_may-1                  rs_may0 
##             1.076623e+05             8.522213e-01             8.236293e-01 
##                  rs_may2                  rs_may3                  rs_may4 
##             1.569624e+00             1.980032e+00             1.891351e+00 
##                  rs_may5                  rs_may6                  rs_may7 
##             2.466964e+00             5.271105e-01             2.868097e+00 
##                  rs_may8                 rs_apr-1                  rs_apr0 
##             1.457459e+02             9.574337e-01             7.535849e-01 
##                  rs_apr2                  rs_apr3                  rs_apr4 
##             1.464261e+00             3.364797e+00             1.338798e+00 
##                  rs_apr5                  rs_apr6                  rs_apr7 
##             3.090482e+00             3.276317e+00             2.383550e+00 
##                  rs_apr8                   bs_sep                   bs_aug 
##             2.746195e+04             9.999969e-01             1.000003e+00 
##                  bs_july                  bs_june                   bs_may 
##             1.000002e+00             1.000000e+00             9.999990e-01 
##                   bs_apr                   ap_sep                   ap_aug 
##             1.000001e+00             9.999838e-01             9.999884e-01 
##                  ap_july                  ap_june                   ap_may 
##             9.999960e-01             9.999950e-01             9.999929e-01 
##                   ap_apr 
##             9.999966e-01

Based on the intercept value 0.3009243 a person is less likely to default on average.